rm(list=ls()) 

###############################################################################
# EUREPOC INCIDENT PROCESSING SCRIPT
## creates following data files: 
## - eurepoc_year_attacks_incidents.RData
## - imputed_data_incidents.RData
###############################################################################

#load packages
library(tidyverse)
library(countrycode)
library(reshape2)
library(peacesciencer)
library(WDI)        
library(mice)
library(lubridate)
library(stringr)

#setwd 
setwd("/Users/cbj4/Desktop/replication")

# set seed 
# this seed must be set to 123 to get same coef in appendix after mice imputation
set.seed(123)

##############################
#STEP 1: DATA LOAD AND CLEAN
##############################

#read data
EuRepoC <- read.csv("raw_data/EuRepoC_Global_Database_1.1-Table 1.csv")

#fix names
EuRepoC$initiator_country <- str_replace_all(EuRepoC$initiator_country, "Korea, Republic of", "Republic of Korea")
EuRepoC$initiator_country <- str_replace_all(EuRepoC$initiator_country, "Korea, Democratic People's Republic of", "Democratic People's Republic of Korea")
EuRepoC$initiator_country <- str_replace_all(EuRepoC$initiator_country, "Iran, Islamic Republic of", "Iran")

EuRepoC$initiator_country <- gsub("['']", "", EuRepoC$initiator_country )
EuRepoC$initiator_country <- gsub("\\[|\\]", "", EuRepoC$initiator_country)

EuRepoC$receiver_country <- str_replace_all(EuRepoC$receiver_country, "Korea, Republic of", "Republic of Korea")
EuRepoC$receiver_country <- str_replace_all(EuRepoC$receiver_country, "Korea, Democratic People's Republic of", "Democratic People's Republic of Korea")
EuRepoC$receiver_country <- str_replace_all(EuRepoC$receiver_country, "Iran, Islamic Republic of", "Iran")

EuRepoC$receiver_country <- gsub("['']", "", EuRepoC$receiver_country )
EuRepoC$receiver_country <- gsub("\\[|\\]", "", EuRepoC$receiver_country)

#create separate year, month, day columns 
EuRepoC <- separate(EuRepoC, start_date, sep = "-", into = c("start_year", "start_month", "start_day"))
EuRepoC <- separate(EuRepoC, end_date, sep = "-", into = c("end_year", "end_month", "end_day"))

##############################
#STEP 2: FIX DATES
##############################

### not every column has a year, month, day provided - separate function fills these with NAs
### some observations have blank start or end dates - separate function leaves these blank
### some observations have dates coded incorrectly (sequentially end year is sequentially prior to end year)

#change blank year cells to NAs 
EuRepoC$start_year <-ifelse(EuRepoC$start_year == "", NA, EuRepoC$start_year)
EuRepoC$end_year <-ifelse(EuRepoC$end_year == "", NA, EuRepoC$end_year)

#replace NA with 2018 (in EUREPOC codebook NA is given when attack is still ongoing)
EuRepoC$end_year <- ifelse(is.na(EuRepoC$end_year), 2018, EuRepoC$end_year) 
#correct end years that occur before start years
EuRepoC$end_year <- ifelse(EuRepoC$end_year < EuRepoC$start_year, EuRepoC$start_year, EuRepoC$end_year) 

#subset observations with date problems 
EuRepoC_check1 <- subset(EuRepoC, start_year > end_year | is.na(start_year) | is.na(end_year))
rm(EuRepoC_check1)

### it looks like date missingness/problems at the year level is exclusive to 
### non-disruption attacks. I drop these observations.

#drop observations with start or end year NA
EuRepoC <- EuRepoC %>% drop_na(start_year)
EuRepoC <- EuRepoC %>% drop_na(end_year)

#drop observations with incorrect start end sequence
EuRepoC <- subset(EuRepoC, start_year < end_year | start_year == end_year)

##############################
#STEP 3: CREATE TARGET CCODES
##############################

#create separate attack columns 
cyber_dat_long <- separate(EuRepoC, 
                           initiator_country, sep = ",", 
                           into = c("attacker1", "attacker2", "attacker3", 
                                    "attacker4", "attacker5", "attacker6", "attacker7"))
### not every attacks has 5 attackers - separate function fills attack columns with with NAs

#create country codes for attack countries 
ccode_attack <- NA
ccode_matrix <- matrix(nrow = 1627, ncol = 7)

for(i in 1:7){
  i <- i + 25
  ccode_attack <- countrycode(cyber_dat_long[,i],"country.name", "cown", nomatch = 0)
  i <- i - 25
  ccode_matrix[,i] <- ccode_attack
}

# add matrix of attack ccodes into the long data 
cyber_dat_long <- cbind(cyber_dat_long, ccode_matrix)

#rename target ccode columns 
names(cyber_dat_long)[98:104] <- c("ccode_attack1", "ccode_attack2", "ccode_attack3",
                                   "ccode_attack4", "ccode_attack5", "ccode_attack6", "ccode_attack7")

#add Serbia attack ccode in manually 
cyber_dat_long$ccode_attack1 <- ifelse(cyber_dat_long$attacker1 == "Serbia", 345, cyber_dat_long$ccode_attack1)
cyber_dat_long$ccode_attack2 <- ifelse(cyber_dat_long$attacker2 == "Serbia", 345, cyber_dat_long$ccode_attack2)

### after checking there are no more ccodes that need to be manually changed for attackers

cyber_dat_long_dyad <- melt(cyber_dat_long, id.vars = c(1:97), 
                            measure.vars = c("ccode_attack1","ccode_attack2","ccode_attack3",
                                             "ccode_attack4", "ccode_attack5",  "ccode_attack6", 
                                             "ccode_attack7"))

#move dyad code to front of data and rename 
cyber_dat_long_dyad <- cyber_dat_long_dyad[,c(ncol(cyber_dat_long_dyad),1:(ncol(cyber_dat_long_dyad)-1))]
names(cyber_dat_long_dyad)[1] <- "ccode_attack"

#remove extra attack ccode column and drop all attacks without ccode info
cyber_dat_long_dyad <- subset(cyber_dat_long_dyad, ccode_attack != 0)

##############################
#STEP 4: CREATE ATTACK COUNT
##############################
cyber_DDY <- cyber_dat_long_dyad

#cyber_DDY <- subset(cyber_DDY, ccode_tar != 999)
#cyber_DDY <- subset(cyber_DDY, ccode_tar != 995)

#create attacker-year variable
cyber_DDY$attacker_year <- paste(cyber_DDY$ccode_attack, cyber_DDY$start_year, sep="_")
length(unique(cyber_DDY$attacker_year))

cyber_DDY <- subset(cyber_DDY, start_year < 2023)

#create an attack count for each unique attacker year
cyber_DDY$attack_bin <- rep(1)
unique_attacker_year <- aggregate(cyber_DDY$attack_bin, list(cyber_DDY$attacker_year), sum)
unique_attacker_year <- separate(unique_attacker_year, col = Group.1, 
                                 into = c("ccode_attack", "year"), sep = "_")
names(unique_attacker_year)[3] <- "attacks"


cyber_DDY$nonstate <- ifelse(grepl("Non-state", cyber_DDY$initiator_category), 1, 0)
cyber_DDY$nonstate <- ifelse(grepl("Individual", cyber_DDY$initiator_category), 1, cyber_DDY$nonstate)
#cyber_DDY$nonstate <- ifelse(grepl("Unknown", cyber_DDY$initiator_category), 1, cyber_DDY$nonstate)

cyber_DDY$state <- ifelse(grepl("State", cyber_DDY$initiator_category), 1, 0)
cyber_DDY$state <- ifelse(grepl("state-affiliated", cyber_DDY$inclusion_criteria_subcode), 1, cyber_DDY$state)
cyber_DDY$state <- ifelse(grepl("state-affiliation", cyber_DDY$initiator_category), 1, cyber_DDY$state)
cyber_DDY$state <- ifelse(grepl("state-attribution", cyber_DDY$inclusion_criteria), 1, cyber_DDY$state)

cyber_DDY$INTRA_ANTIGOV[is.na(cyber_DDY$INTRA_ANTIGOV)] <- 0
cyber_DDY$INTRA[is.na(cyber_DDY$INTRA)] <- 0


# Reviewer has asked about whether intrastate attacks change results 
#cyber_DDY <- subset(cyber_DDY, INTRA != 1)
cyber_DDY <- subset(cyber_DDY, nonstate == 1)
cyber_DDY <- subset(cyber_DDY, state != 1)

cyber_DDY$disrupt_bin <- rep(1)
unique_attacker_year_disrupt <- aggregate(cyber_DDY$disrupt_bin, list(cyber_DDY$attacker_year), sum)
unique_attacker_year_disrupt <- separate(unique_attacker_year_disrupt, col = Group.1, 
                                         into = c("ccode_attack", "year"), sep = "_")
names(unique_attacker_year_disrupt)[3] <- "disruption_attacks"



###############################################
# STEP 5: CREATE BASE COUNTRY-YEAR ATTACK DATA
###############################################

#create country-year base
country_base <- create_stateyears(system = "cow", mry = T) %>%
  filter(between(year, 2000, 2022)) 

#%>% add_cow_majors() 

### this function has a bug 
### it adds 1s for every country as a COW MAJ in 2017 and 2018

#fix cow major dummy for 2017 and 2018 
#country_base$cowmaj <- ifelse(country_base$year == 2017, 0, country_base$cowmaj)
#country_base$cowmaj <- ifelse(country_base$year == 2018, 0, country_base$cowmaj)
#country_base$cowmaj <- ifelse(country_base$ccode == 2 |country_base$ccode == 200 |
#country_base$ccode == 222 |country_base$ccode == 255 |
#country_base$ccode == 365 |country_base$ccode == 710 |
#country_base$ccode == 740, 1, country_base$cowmaj)

# create time until the next election variable
country_base$time_to_elect <- rep(0)
country_base$time_to_elect <- ifelse(country_base$year == 2021 | country_base$year == 2017 | country_base$year == 2013 | 
                                       country_base$year == 2009 | country_base$year == 2005 |
                                       country_base$year == 2001, 1, country_base$time_to_elect)
country_base$time_to_elect <- ifelse(country_base$year == 2020 |
                                       country_base$year == 2016 | country_base$year == 2012 | 
                                       country_base$year == 2008 | country_base$year == 2004 |
                                       country_base$year == 2000, 2, country_base$time_to_elect)
country_base$time_to_elect <- ifelse(country_base$year == 2019 |
                                       country_base$year == 2015 | country_base$year == 2011 | 
                                       country_base$year == 2007 | country_base$year == 2003, 
                                     3, country_base$time_to_elect)

#create dummy variable for each time to elect category (1-4)
country_base$T_0 <- ifelse(country_base$time_to_elect == 0, 1, 0)
country_base$T_1 <- ifelse(country_base$time_to_elect == 1, 1, 0)
country_base$T_2 <- ifelse(country_base$time_to_elect == 2, 1, 0)
country_base$T_3 <- ifelse(country_base$time_to_elect == 3, 1, 0)

#create T variable (linear time)
country_base$time <- (country_base$year + 1) - 2000

#merge base with attack counts 
country_year_attacks <- merge(country_base, unique_attacker_year, 
                              by.x = c("ccode", "year"), by.y = c("ccode_attack", "year"), 
                              all.x = T)
country_year_attacks <- merge(country_year_attacks, unique_attacker_year_disrupt, 
                              by.x = c("ccode", "year"), by.y = c("ccode_attack", "year"), 
                              all.x = T)
country_year_attacks$attacks[is.na(country_year_attacks$attacks)] <- 0
country_year_attacks$disruption_attacks[is.na(country_year_attacks$disruption_attacks)] <- 0

# read in percent usage (control) for every country year 
ITU_percent <- read.csv("raw_data/ITU_percent.csv")

#add ccodes to percent usage data
ITU_percent$ccode <- countrycode(ITU_percent$Economy, 
                                 origin = "country.name", destination = "cown", 
                                 nomatch = 0)

# manually add ccode for countries not matched by function 
ITU_percent$ccode <- ifelse(ITU_percent$Economy == "Serbia", 345, ITU_percent$ccode)
ITU_percent$ccode <- ifelse(ITU_percent$Economy == "Türkiye", 640, ITU_percent$ccode)

#transform percent usage from wide to long
ITU_percent_long <- melt(ITU_percent, id.vars = c(1:69), 
                         measure.vars = c("X2000_value", "X2001_value", 
                                          "X2002_value", "X2003_value", 
                                          "X2004_value", "X2005_value", 
                                          "X2006_value", "X2007_value",
                                          "X2008_value", "X2009_value", 
                                          "X2010_value", "X2011_value", 
                                          "X2012_value", "X2013_value", 
                                          "X2014_value", "X2015_value", 
                                          "X2016_value", "X2017_value", 
                                          "X2018_value", "X2019_value", 
                                          "X2020_value", "X2021_value"))

#create a year variable from the column names
ITU_percent_long$year <- as.numeric(gsub('[X_value]','',ITU_percent_long$variable))
ITU_percent_long <- subset(ITU_percent_long, select = c(69,71,72))
names(ITU_percent_long)[2] <- "percent_usage"

#merge percent usage with base country-year attacks data
country_year_attacks <- merge(country_year_attacks, ITU_percent_long, by = c("ccode", "year"), all.x = T)


##########################################
# STEP 6: CREATE ITU CANDIDATE TREATMENT
#########################################

# read in ITU council and ITU radio board data
ITU <- read.csv("raw_data/ITU.csv")

#RB_ITU <- read.csv("RB_ITU.csv")
RB_ITU <- read.csv("raw_data/RB_ITU.csv")

names(RB_ITU)[1] <- "state"
names(RB_ITU)[4] <- "elect_year"

#start with elect year 2002
ITU <- subset(ITU, elect_year > 2000)
RB_ITU <- subset(RB_ITU, elect_year > 2000)
ITU_merge <- rbind(ITU, RB_ITU)

#add ccodes to ITU data
ITU_merge$ccode <- countrycode(ITU_merge$state, 
                               "country.name", "cown", nomatch = 0)

#add ccodes for states not matched by function
ITU_merge$ccode <- ifelse(ITU_merge$state == "United  States", 2, ITU_merge$ccode)
ITU_merge$ccode <- ifelse(ITU_merge$state == "Serbia", 345, ITU_merge$ccode)

#create unique ITU candidate years 
ITU_merge$ccode_year <- paste(ITU_merge$ccode, ITU_merge$elect_year, sep="_")
unique_ITU_year <- as.data.frame(table(ITU_merge$ccode_year))
unique_ITU_year <- separate(unique_ITU_year, col = Var1, into = c("ccode", "year"), 
                            sep = "_")
unique_ITU_year <- subset(unique_ITU_year, select = c(1,2))
unique_ITU_year$candidate <- rep(1)

#expand candidate years to include years prior to election and create candidate binary
#for merge into base data

unique_ITU_year$year <- as.numeric(unique_ITU_year$year)

year_subtract1 <- function(y, x){
  ccode <- as.data.frame(print(y))
  year <- as.data.frame(print(x - 1))
  df1 <- data.frame(ccode, year)
  colnames(df1) <- c("ccode","year")
  return(df1)
}

df1 <- year_subtract1(unique_ITU_year$ccode, unique_ITU_year$year)
df1$candidate <- rep(1)

year_subtract2 <- function(y, x){
  ccode <- as.data.frame(print(y))
  year <- as.data.frame(print(x - 2))
  df2 <- data.frame(ccode, year)
  colnames(df2) <- c("ccode","year")
  return(df2)
}

df2 <- year_subtract2(unique_ITU_year$ccode, unique_ITU_year$year)
df2$candidate <- rep(1)

year_subtract3 <- function(y, x){
  ccode <- as.data.frame(print(y))
  year <- as.data.frame(print(x - 3))
  df3 <- data.frame(ccode, year)
  colnames(df3) <- c("ccode","year")
  return(df3)
}

df3 <- year_subtract3(unique_ITU_year$ccode, unique_ITU_year$year)
df3$candidate <- rep(1)

#combine all years into one data set and exclude years prior to 2000
SY_ITU_dat <- rbind(unique_ITU_year, df1, df2, df3)
SY_ITU_dat <- subset(SY_ITU_dat, year > 1999)

#merge ITU candidate status into country year attack base data 
country_year_attacks <- merge(country_year_attacks, SY_ITU_dat, by = c("ccode", "year"), all.x = T)
country_year_attacks$candidate[is.na(country_year_attacks$candidate)] <- 0

#create spell factors for within analysis 
country_year_attacks$spell <- ifelse(country_year_attacks$time < 4, 1, 0)
country_year_attacks$spell <- ifelse(country_year_attacks$time < 8 & country_year_attacks$time > 3 , 2, country_year_attacks$spell)
country_year_attacks$spell <- ifelse(country_year_attacks$time < 12 & country_year_attacks$time > 7 , 3, country_year_attacks$spell)
country_year_attacks$spell <- ifelse(country_year_attacks$time < 16 & country_year_attacks$time > 11 , 4, country_year_attacks$spell)
country_year_attacks$spell <- ifelse(country_year_attacks$time < 20 & country_year_attacks$time > 15 , 5, country_year_attacks$spell)
country_year_attacks$spell <- ifelse(country_year_attacks$time > 19, 6, country_year_attacks$spell)

country_year_attacks$country_spell <- paste(country_year_attacks$ccode, country_year_attacks$spell, sep = "_")
country_year_attacks$country_spell <- factor(country_year_attacks$country_spell)

#################################################################################
# STEP 7: CREATE FINAL "eurepoc_year_attacks_incidents.RData" DATA SAVE AS RData
#################################################################################

eurepoc_year_data <- country_year_attacks
eurepoc_year_data$election <- eurepoc_year_data$T_0
eurepoc_year_data <- subset(eurepoc_year_data, select = -c(attacks))
names(eurepoc_year_data)[10] <- "attacks"

eurepoc_year_data_startyear <- eurepoc_year_data

#create all country data 
eurepoc_year_data <- subset(eurepoc_year_data, year < 2019)
eurepoc_year_data$treat <- ifelse(eurepoc_year_data$candidate == 1 & eurepoc_year_data$election == 1, 1, 0)

#create within candidate data 
SY_within <- subset(eurepoc_year_data, candidate == 1)
SY_within <- subset(SY_within, year < 2019)

eurepoc_year_data_incident <- eurepoc_year_data

#save(eurepoc_year_data_incident, file = "~/My Drive/Cyber proxies - ITU /R&R/eurepoc_year_attacks_incidents.RData")
#save(eurepoc_year_data_incident, file = "data/eurepoc_year_attacks_incidents.RData")

#########################
# STEP 8: CHECK MODEL 1
##########################

spell_fix_linear2_delete <- lm(attacks ~ treat + factor(country_spell), data = eurepoc_year_data)
summary(spell_fix_linear2_delete)

####################################
# STEP 9: ADD CONTROLS W/ IMPUTATION
####################################

# create country year base 
ccode_years <- create_stateyears(system = "cow", mry = T) %>%
  filter(between(year, 2000, 2018)) 

ccode_vector <- unique(eurepoc_year_data$ccode)
ccode_years <- ccode_years[(ccode_years$ccode %in% ccode_vector),]

length(unique(ccode_years$ccode))

# add model variables into base
model <- subset(eurepoc_year_data, select = c("ccode", "year", "candidate", "attacks", "spell", "country_spell", 
                                              "T_0", "T_1", "T_2", "T_3", "election", "time", "time_to_elect",
                                              "treat"))

ccode_years <- merge(ccode_years, model, by = c("ccode", "year"), all.x = T)

ccode_years$candidate[is.na(ccode_years$candidate)] <- 0

table(ccode_years$candidate)

#add WDI controls data 
controls <- WDI(
  country = "all",
  indicator = c("NY.GDP.PCAP.KD","BX.GSR.CCIS.ZS", "TM.VAL.ICTG.ZS.UN", "IP.JRN.ARTC.SC", "IT.NET.USER.ZS", 
                "IT.NET.BBND.P2", "IT.CEL.SETS.P2", "SI.POV.DDAY"),
  start = 2000,
  end = 2018,
  extra = FALSE,
  cache = NULL,
  latest = NULL,
  language = "en"
)

controls$ccode <- countrycode(controls$iso3c, "iso3c", "cown")
controls$ccode <- ifelse(controls$iso3c == "SRB", 345, controls$ccode)

controls <- subset(controls, select = c(4:13))
names(controls)[2:9] <- c("gdp_percap", "ict_exports", "ict_imports", "tech_articles", "internet_usage", "broadband_subs", "mobile_subs", "extreme_poverty")

# Add international bandwidth 
int_bandwidth <- read.csv("raw_data/international-bandwidth.csv")

int_bandwidth$ccode <- countrycode(int_bandwidth$isoCode, origin = "iso3c", destination = "cown")
int_bandwidth$ccode <- ifelse(int_bandwidth$isoCode == "SRB", 345, int_bandwidth$ccode)

names(int_bandwidth)[8] <- "year"
int_bandwidth <- subset(int_bandwidth, select = c(8,11, 12))
names(int_bandwidth)[2] <- "bandwidth"

# Add annual investment in tele services 
tele_invest <- read.csv("~/My Drive/Cyber proxies - ITU /Data - Controls/annual-investment-in-telecommunication-services.csv")

tele_invest$ccode <- countrycode(tele_invest$isoCode, origin = "iso3c", destination = "cown")
tele_invest$ccode <- ifelse(tele_invest$isoCode == "SRB", 345, tele_invest$ccode)

names(tele_invest)[8] <- "year"
tele_invest <- subset(tele_invest, select = c(8,11, 12))
names(tele_invest)[2] <- "tele_invest"

# merge with controls 
ccode_years <- merge(ccode_years, controls, by = c("ccode", "year"), all.x = T)
ccode_years <- merge(ccode_years, int_bandwidth, by = c("ccode", "year"), all.x = T)
ccode_years <- merge(ccode_years, tele_invest, by = c("ccode", "year"), all.x = T)

# CREATE IMPUTED VALUES  
predM <- quickpred(ccode_years, exclude = c("ccode","year", "statenme", "candidate", "spell", "election", "treat", 
                                            "country_spell", "T_0", "T_1", "T_2", "T_3", "time", "time_to_elect"),
                   method = "pearson")


impute_data <- ccode_years
impute_data$ccode <- factor(impute_data$ccode)
impute_data$year <- factor(impute_data$year)

#imp0 <- mice(impute_data,maxit = 0)

#predM <- imp0$predictorMatrix

#predM[, c("ccode")] <- 0
#predM[, c("year")] <- 0
#predM[, c("time_to_elect")] <- 0
#predM[, c("T_0")] <- 0
#predM[, c("T_1")] <- 0
#predM[, c("T_2")] <- 0
#predM[, c("T_3")] <- 0
#predM[, c("time")] <- 0
#predM[, c("attacks")] <- 0
#predM[, c("spell")] <- 0
#predM[, c("country_spell")] <- 0
#predM[, c("candidate")] <- 0
#predM[, c("election")] <- 0
#predM[, c("statenme")] <- 0
#predM[, c("broadband_subs")] <- 0

imp1 <- mice(impute_data, maxit = 5, 
             predictorMatrix = predM, print =  TRUE, method = "cart", remove_collinear = TRUE )

imp1$loggedEvents
#summary(imp1) 
d.long.incidents <- complete(imp1,"long",include = T)

d.long.incidents$log_tech_articles <- log(d.long.incidents$tech_articles + 1)

#################################################################################
# STEP 10: CREATE FINAL "imputed_data_incidents.RData" DATA SAVE AS RData
#################################################################################

#save(d.long.incidents, file = "~/My Drive/Cyber proxies - ITU /R&R/imputed_data_incidents.RData")
#save(d.long.incidents, file = "data/imputed_data_incidents.RData")

#########################
# STEP 11: CHECK MODEL 2
##########################

imp.candidates.incidents <- as.mids(d.long.incidents)

fit.A <- with(imp.candidates.incidents, lm(attacks ~ treat + factor(country_spell) +
                                             internet_usage + ict_exports + log_tech_articles))
m2 <- pool(fit.A)
summary(m2)

